library(tidyverse)
library(readr)
library(modelr)
day <- read_csv("./day.csv")
temp_orig, feel_temp_orig, hum_orig, windspeed_orig.day <- (day %>%
mutate(temp_orig = temp * 39,
feel_temp_orig = atemp * 50,
hum_orig = hum * 100,
windspeed_org = windspeed * 67))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 1 0 1 0 6 0 2
## 2 2 2011-01-02 1 0 1 0 0 0 2
## 3 3 2011-01-03 1 0 1 0 1 1 1
## 4 4 2011-01-04 1 0 1 0 2 1 1
## 5 5 2011-01-05 1 0 1 0 3 1 1
## 6 6 2011-01-06 1 0 1 0 4 1 1
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
weathersit variable to be a factor with the values “clear”, “mist”, “light precip”, and “heavy precip”, based on the variable definitions. This factor should have the ordering “clear”, “mist”, “light precip”, “heavy precip”.day$weathersit <- factor(day$weathersit,
levels = c(1, 2, 3, 4),
labels = c('clear', 'mist', 'light precip', 'heavy precip'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1 2011-01-01 1 0 1 0 6 0 mist
## 2 2 2011-01-02 1 0 1 0 0 0 mist
## 3 3 2011-01-03 1 0 1 0 1 1 clear
## 4 4 2011-01-04 1 0 1 0 2 1 clear
## 5 5 2011-01-05 1 0 1 0 3 1 clear
## 6 6 2011-01-06 1 0 1 0 4 1 clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
day$workingday <- factor(day$workingday,
levels = c(1, 0),
labels = c('work day', 'weekend/holiday'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 0 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 0 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 0 1 0 1 work day clear
## 4 4 2011-01-04 1 0 1 0 2 work day clear
## 5 5 2011-01-05 1 0 1 0 3 work day clear
## 6 6 2011-01-06 1 0 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
day$yr <- factor(day$yr,
levels = 0:1,
labels = c('2011', '2012'))
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
dteday to be recognized as dates by R.# dteday is already recognized as a date.
head(day)
## # A tibble: 6 × 20
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 11 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>
dteday to create a new variable, days, that is the number of days since January 1, 2011. Note that you can subtract two dates. Keep in mind that the value stored in days should be stored as a number.day <- (day %>%
mutate(days = as.numeric(day$dteday - as.Date("11-01-01", "%y-%m-%d"))))
head(day)
## # A tibble: 6 × 21
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/holi… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/holi… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## # … with 12 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>,
## # days <dbl>
Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and the temperature (temp_orig).
ggplot(day) +
geom_point(aes(x = temp_orig, y = cnt, col = temp)) +
labs(x = 'Bike rentals',
y = 'Temperature in Celcius',
title = 'Bike Rentals in DC in 2011 and 2012',
subtitle = 'Warmer weather leads to more rentals')
temp_orig and cnt variables from your modified bikes data frame.bt <- data.frame(day['temp_orig'], day['cnt'])
head(bt)
## temp_orig cnt
## 1 13.422513 985
## 2 14.175642 801
## 3 7.658196 1349
## 4 7.800000 1562
## 5 8.851323 1600
## 6 7.969572 1606
cnt, is the dependent (response) variable and the original temperature in Celsius, temp_orig, is the independent (predictor) variable. Save these with different names.# Linear
line_day <- lm(cnt ~ temp_orig, data = bt)
line_coe <- coefficients(line_day)
# Quadratic
bt2 <- (bt %>%
mutate(tsqrt = temp_orig ** 2))
quad_day <- model_matrix(cnt ~ temp_orig + tsqrt, data = bt2)
pred <- lm(cnt ~ temp_orig + tsqrt, data = bt2)
quad_day <- quad_day %>%
add_predictions(pred)
grid data frame with values of temp_orig (cut into 20 evenly spaced values) and separately add the predictions from both models to this data frame. NOTE: Do not use gather_predictions here, as Dr. McNelis wants the model predictions to be in two different columns. The predictions from the linear model should be labeled “linpred” and those from the quadratic model labelled as “quadpred”. Note, the add_predictions function allows you to specify the name of variable containing the predictions.temp_grid <- (bt %>%
data_grid(temp_orig = seq_range(temp_orig, 20)))
temp_grid <- (temp_grid %>%
add_predictions(line_day, var = 'linpred'))
temp_grid <- (temp_grid %>%
mutate(tsqrt = temp_orig ^ 2) %>%
add_predictions(pred, var = 'quadpred') %>%
select(-tsqrt))
temp_grid
## # A tibble: 20 × 3
## temp_orig linpred quadpred
## <dbl> <dbl> <dbl>
## 1 2.31 1607. -689.
## 2 3.95 1888. 113.
## 3 5.60 2168. 862.
## 4 7.25 2449. 1556.
## 5 8.90 2729. 2197.
## 6 10.5 3010. 2785.
## 7 12.2 3290. 3318.
## 8 13.8 3571. 3798.
## 9 15.5 3851. 4224.
## 10 17.1 4132. 4597.
## 11 18.8 4412. 4915.
## 12 20.4 4693. 5180.
## 13 22.1 4973. 5391.
## 14 23.7 5254. 5549.
## 15 25.4 5534. 5653.
## 16 27.0 5815. 5703.
## 17 28.7 6095. 5699.
## 18 30.3 6376. 5642.
## 19 32.0 6656. 5531.
## 20 33.6 6937. 5366.
temp_orig versus cnt and add curves for your linear and quadratic models from your grid data frame. Have a nice title, subtitle, and axes labels on your graph.ggplot(day, aes(x = temp_orig, y = cnt, col = temp)) +
geom_point(alpha = .3) +
geom_line(data = temp_grid, aes(x = temp_orig, y = quadpred), color = "red") +
geom_line(data = temp_grid, aes(x = temp_orig, y = linpred), color = 'blue') +
labs(x = 'Temperature in Celcius',
y = 'Bike rentals',
title = "Comparing Models",
subtitle = "Blue is linear and red is quadratic.")
bt <- (bt %>%
add_residuals(line_day, var = 'line') %>%
mutate(tsqrt = temp_orig ^ 2) %>%
add_residuals(pred, var = 'quad') %>%
select(-tsqrt))
bt_long <- (bt %>%
pivot_longer(cols = c('line', 'quad'),
names_to = 'model',
values_to = 'resids'))
bt_long
## # A tibble: 1,462 × 4
## temp_orig cnt model resids
## <dbl> <dbl> <chr> <dbl>
## 1 13.4 985 line -2515.
## 2 13.4 985 quad -2697.
## 3 14.2 801 line -2827.
## 4 14.2 801 quad -3089.
## 5 7.66 1349 line -1170.
## 6 7.66 1349 quad -372.
## 7 7.8 1562 line -981.
## 8 7.8 1562 quad -215.
## 9 8.85 1600 line -1122.
## 10 8.85 1600 quad -581.
## # … with 1,452 more rows
ggplot(bt_long) +
geom_point(aes(x = temp_orig, y = resids, col = model)) +
geom_ref_line(h = 0) +
facet_grid(~model) +
labs(x = 'Temperature in Celcius',
y = 'Residual',
title = "Residuals of the linear and quadratic models",
subtitle = "Blue is linear and red is quadratic.") +
theme(legend.position = 'none')
Overall the quadratic model seems like it represents the data better. The amount of rentals flattens out and decreases a little after 20 degrees, the linear model does not capture that at all. The linear model’s residuals also creates a pretty defined quadratic curve, whereas the quadratic model’s residuals are distributed better.
Now you’ll investigate different models predicting relationships between the number of rentals (cnt) and time, in terms of the number of days since January 1, 2011 (days).
ggplot(day) +
geom_point(aes(x = days, y = cnt, col = temp_orig)) +
labs(x = 'Days since January 1, 2011',
y = 'Bike rentrals',
title = 'Bike Rentals in DC, 2011 and 2012',
subtitle = 'Rentals trends over time')
bd (for bike days) that contains the days and cnt variables from your modified bikes data frame.bd <- data.frame(day['days'], day['cnt'])
fit_order_n that takes an input of an integer n and returns a scatter plot of our data (like that in problem 1) with our nth order polynomial plotted along with the data. Make sure that the graph includes a TITLE that indicates the order of the polynomial (hint: check out help on the paste function).fit_order_n <- function(n) {
fit <- lm(cnt ~ poly(days, degree = n), data = bd)
p_grid <- data_grid(bd, days)
p_grid <- (bd %>%
gather_predictions(fit))
ggplot(p_grid) +
geom_point(aes(x = days, y = cnt), alpha = 0.2) +
geom_line(aes(x = days, y = pred), col = 'red') +
labs(x = 'Days since January 1, 2011',
y = 'Bike rentrals',
title = 'Estimating Bike Rentals in DC',
subtitle = paste('using ', n,
ifelse(n == 1, 'st',
ifelse(n == 2, 'nd',
ifelse(n == 3, 'rd', 'th'))),
' order polynomial', sep = ''))
}
# I couldn't get a for loop to work
fit_order_n(1)
fit_order_n(2)
fit_order_n(3)
fit_order_n(4)
fit_order_n(5)
fit_order_n(6)
fit_order_n(7)
fit_order_n(8)
fit_order_n(9)
fit_order_n(10)
Using a 4th order polynomial seems to be the minimum that has the same shape as the data data, but using a 6th order polynomial seems to be a substantial improvement over 4th and 5th, and then more or less equal to higher order polynomials.
B = 2 * pi / 365.25
mod3 <- lm(cnt ~ sin(B * days) + cos(B * days), data = day)
coe <- coefficients(mod3)
phi <- atan(coe[[3]] / coe[[2]])
A <- coe[[2]] / cos(phi)
k = coe[[1]]
grid data frame with values of days, then use mutate to add a new column to grid called “sinepred” whose values are the values of A sin(B ∗ days + φ) + k.sine_grid <- (day %>%
data_grid(days) %>%
mutate(sinepred = A * sin(B * days + phi) + k))
ggplot(day) +
geom_point(aes(x = days, y = cnt, col = temp_orig), alpha = 0.3) +
geom_line(data = sine_grid, aes(x = days, y = sinepred), col = 'red') +
labs(x = 'Days since January 1, 2011',
y = 'Bike rentrals',
title = 'Estimating Bike Rentals in DC',
subtitle = 'Using a sinusoidal model')
bd2 <- (bd %>%
gather_residuals(mod3))
ggplot(bd2) +
geom_point(aes(x = days, y = resid), alpha = 0.5) +
geom_ref_line(h = 0) +
labs(x = 'Days since January 01, 2011',
y = 'Residual',
title = "Residuals of the update models predicting based off days") +
theme(legend.position = 'none')
I think the next best predictors to use could be season, mnth, dteday, and weathersit. I made a couple tibbles below to show the average for each of them. season would not be the best predictor, especially since mnth holds practically the same information and is more specific. dteday and mnth would be likely be similar to the model we made with days, so making a model using them as a predictor could be redundant.
I originally thought workingday would be a good predictor. I was expecting none working days to have a lower average, but working days actually end up having about 200 more bike rentals per day. In retrospect that makes sense, as people use them to commute. Either way, I don’t think there is a significant enough difference to really be a good predictor.
day
## # A tibble: 731 × 21
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <fct>
## 1 1 2011-01-01 1 2011 1 0 6 weekend/hol… mist
## 2 2 2011-01-02 1 2011 1 0 0 weekend/hol… mist
## 3 3 2011-01-03 1 2011 1 0 1 work day clear
## 4 4 2011-01-04 1 2011 1 0 2 work day clear
## 5 5 2011-01-05 1 2011 1 0 3 work day clear
## 6 6 2011-01-06 1 2011 1 0 4 work day clear
## 7 7 2011-01-07 1 2011 1 0 5 work day mist
## 8 8 2011-01-08 1 2011 1 0 6 weekend/hol… mist
## 9 9 2011-01-09 1 2011 1 0 0 weekend/hol… clear
## 10 10 2011-01-10 1 2011 1 0 1 work day clear
## # … with 721 more rows, and 12 more variables: temp <dbl>, atemp <dbl>,
## # hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>,
## # temp_orig <dbl>, feel_temp_orig <dbl>, hum_orig <dbl>, windspeed_org <dbl>,
## # days <dbl>
ggplot(day) +
geom_point(aes(x = temp_orig, y = cnt, col = weathersit))
str(day)
## spec_tbl_df [731 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ instant : num [1:731] 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : Date[1:731], format: "2011-01-01" "2011-01-02" ...
## $ season : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : Factor w/ 2 levels "2011","2012": 1 1 1 1 1 1 1 1 1 1 ...
## $ mnth : num [1:731] 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num [1:731] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : num [1:731] 6 0 1 2 3 4 5 6 0 1 ...
## $ workingday : Factor w/ 2 levels "work day","weekend/holiday": 2 2 1 1 1 1 1 2 2 1 ...
## $ weathersit : Factor w/ 4 levels "clear","mist",..: 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num [1:731] 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num [1:731] 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num [1:731] 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num [1:731] 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : num [1:731] 331 131 120 108 82 88 148 68 54 41 ...
## $ registered : num [1:731] 654 670 1229 1454 1518 ...
## $ cnt : num [1:731] 985 801 1349 1562 1600 ...
## $ temp_orig : num [1:731] 13.42 14.18 7.66 7.8 8.85 ...
## $ feel_temp_orig: num [1:731] 18.18 17.69 9.47 10.61 11.46 ...
## $ hum_orig : num [1:731] 80.6 69.6 43.7 59 43.7 ...
## $ windspeed_org : num [1:731] 10.7 16.7 16.6 10.7 12.5 ...
## $ days : num [1:731] 0 1 2 3 4 5 6 7 8 9 ...
## - attr(*, "spec")=
## .. cols(
## .. instant = col_double(),
## .. dteday = col_date(format = ""),
## .. season = col_double(),
## .. yr = col_double(),
## .. mnth = col_double(),
## .. holiday = col_double(),
## .. weekday = col_double(),
## .. workingday = col_double(),
## .. weathersit = col_double(),
## .. temp = col_double(),
## .. atemp = col_double(),
## .. hum = col_double(),
## .. windspeed = col_double(),
## .. casual = col_double(),
## .. registered = col_double(),
## .. cnt = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
day %>%
group_by(mnth) %>%
summarise(avg = mean(cnt))
## # A tibble: 12 × 2
## mnth avg
## <dbl> <dbl>
## 1 1 2176.
## 2 2 2655.
## 3 3 3692.
## 4 4 4485.
## 5 5 5350.
## 6 6 5772.
## 7 7 5564.
## 8 8 5664.
## 9 9 5767.
## 10 10 5199.
## 11 11 4247.
## 12 12 3404.
day %>%
group_by(season) %>%
summarise(avg = mean(cnt))
## # A tibble: 4 × 2
## season avg
## <dbl> <dbl>
## 1 1 2604.
## 2 2 4992.
## 3 3 5644.
## 4 4 4728.
day %>%
group_by(weathersit) %>%
summarise(avg = mean(cnt))
## # A tibble: 3 × 2
## weathersit avg
## <fct> <dbl>
## 1 clear 4877.
## 2 mist 4036.
## 3 light precip 1803.
day %>%
group_by(workingday) %>%
summarise(avg = mean(cnt))
## # A tibble: 2 × 2
## workingday avg
## <fct> <dbl>
## 1 work day 4585.
## 2 weekend/holiday 4330.